In [87]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report, confusion_matrix,r2_score
In [88]:
df = pd.read_csv("AirQualityUCI.csv",sep=";",decimal=",",low_memory=False)
df.head()   
Out[88]:
Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Unnamed: 15 Unnamed: 16
0 10/03/2004 18.00.00 2.6 1360.0 150.0 11.9 1046.0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 NaN NaN
1 10/03/2004 19.00.00 2.0 1292.0 112.0 9.4 955.0 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 NaN NaN
2 10/03/2004 20.00.00 2.2 1402.0 88.0 9.0 939.0 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 NaN NaN
3 10/03/2004 21.00.00 2.2 1376.0 80.0 9.2 948.0 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 NaN NaN
4 10/03/2004 22.00.00 1.6 1272.0 51.0 6.5 836.0 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 NaN NaN
In [89]:
df.shape
Out[89]:
(9471, 17)
In [90]:
df.describe()
Out[90]:
CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Unnamed: 15 Unnamed: 16
count 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 9357.000000 0.0 0.0
mean -34.207524 1048.990061 -159.090093 1.865683 894.595276 168.616971 794.990168 58.148873 1391.479641 975.072032 9.778305 39.485380 -6.837604 NaN NaN
std 77.657170 329.832710 139.789093 41.380206 342.333252 257.433866 321.993552 126.940455 467.210125 456.938184 43.203623 51.216145 38.976670 NaN NaN
min -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 -200.000000 NaN NaN
25% 0.600000 921.000000 -200.000000 4.000000 711.000000 50.000000 637.000000 53.000000 1185.000000 700.000000 10.900000 34.100000 0.692300 NaN NaN
50% 1.500000 1053.000000 -200.000000 7.900000 895.000000 141.000000 794.000000 96.000000 1446.000000 942.000000 17.200000 48.600000 0.976800 NaN NaN
75% 2.600000 1221.000000 -200.000000 13.600000 1105.000000 284.000000 960.000000 133.000000 1662.000000 1255.000000 24.100000 61.900000 1.296200 NaN NaN
max 11.900000 2040.000000 1189.000000 63.700000 2214.000000 1479.000000 2683.000000 340.000000 2775.000000 2523.000000 44.600000 88.700000 2.231000 NaN NaN
In [91]:
df = df.replace(-200,np.nan)
In [92]:
df.describe()
Out[92]:
CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Unnamed: 15 Unnamed: 16
count 7674.000000 8991.000000 914.000000 8991.000000 8991.000000 7718.000000 8991.000000 7715.000000 8991.000000 8991.000000 8991.000000 8991.000000 8991.000000 0.0 0.0
mean 2.152750 1099.833166 218.811816 10.083105 939.153376 246.896735 835.493605 113.091251 1456.264598 1022.906128 18.317829 49.234201 1.025530 NaN NaN
std 1.453252 217.080037 204.459921 7.449820 266.831429 212.979168 256.817320 48.370108 346.206794 398.484288 8.832116 17.316892 0.403813 NaN NaN
min 0.100000 647.000000 7.000000 0.100000 383.000000 2.000000 322.000000 2.000000 551.000000 221.000000 -1.900000 9.200000 0.184700 NaN NaN
25% 1.100000 937.000000 67.000000 4.400000 734.500000 98.000000 658.000000 78.000000 1227.000000 731.500000 11.800000 35.800000 0.736800 NaN NaN
50% 1.800000 1063.000000 150.000000 8.200000 909.000000 180.000000 806.000000 109.000000 1463.000000 963.000000 17.800000 49.600000 0.995400 NaN NaN
75% 2.900000 1231.000000 297.000000 14.000000 1116.000000 326.000000 969.500000 142.000000 1674.000000 1273.500000 24.400000 62.500000 1.313700 NaN NaN
max 11.900000 2040.000000 1189.000000 63.700000 2214.000000 1479.000000 2683.000000 340.000000 2775.000000 2523.000000 44.600000 88.700000 2.231000 NaN NaN
In [93]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9471 entries, 0 to 9470
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           9357 non-null   object 
 1   Time           9357 non-null   object 
 2   CO(GT)         7674 non-null   float64
 3   PT08.S1(CO)    8991 non-null   float64
 4   NMHC(GT)       914 non-null    float64
 5   C6H6(GT)       8991 non-null   float64
 6   PT08.S2(NMHC)  8991 non-null   float64
 7   NOx(GT)        7718 non-null   float64
 8   PT08.S3(NOx)   8991 non-null   float64
 9   NO2(GT)        7715 non-null   float64
 10  PT08.S4(NO2)   8991 non-null   float64
 11  PT08.S5(O3)    8991 non-null   float64
 12  T              8991 non-null   float64
 13  RH             8991 non-null   float64
 14  AH             8991 non-null   float64
 15  Unnamed: 15    0 non-null      float64
 16  Unnamed: 16    0 non-null      float64
dtypes: float64(15), object(2)
memory usage: 1.2+ MB
In [94]:
df = df.drop(columns=["Unnamed: 15","Unnamed: 16"])
df = df.dropna()
print(df.info())
print(df.shape)
<class 'pandas.core.frame.DataFrame'>
Index: 827 entries, 0 to 1230
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           827 non-null    object 
 1   Time           827 non-null    object 
 2   CO(GT)         827 non-null    float64
 3   PT08.S1(CO)    827 non-null    float64
 4   NMHC(GT)       827 non-null    float64
 5   C6H6(GT)       827 non-null    float64
 6   PT08.S2(NMHC)  827 non-null    float64
 7   NOx(GT)        827 non-null    float64
 8   PT08.S3(NOx)   827 non-null    float64
 9   NO2(GT)        827 non-null    float64
 10  PT08.S4(NO2)   827 non-null    float64
 11  PT08.S5(O3)    827 non-null    float64
 12  T              827 non-null    float64
 13  RH             827 non-null    float64
 14  AH             827 non-null    float64
dtypes: float64(13), object(2)
memory usage: 103.4+ KB
None
(827, 15)
In [95]:
sns.pairplot(df)
plt.show()
No description has been provided for this image
In [96]:
x = df[['T', 'RH', 'AH', 
        'PT08.S1(CO)', 'PT08.S2(NMHC)', 
        'PT08.S3(NOx)', 'PT08.S4(NO2)', 'PT08.S5(O3)']]

y = df[['CO(GT)', 'NO2(GT)', 'NMHC(GT)']]
In [97]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 42)
In [98]:
lr = LinearRegression()
lr.fit(x_train,y_train)
lr.score(x_test,y_test)
Out[98]:
0.8783303966353667
In [99]:
predict= lr.predict(x)
y_pred = lr.predict(x_test)
print(predict)
[[  3.17348135 111.64814472 288.82446478]
 [  2.78711689 103.85687253 231.35508484]
 [  2.80463596 113.43480914 155.66451163]
 ...
 [  3.09676468 109.08900584 383.82767278]
 [  3.03214566 112.87501141 335.0666682 ]
 [  3.23093203 117.55282217 373.72525041]]
In [100]:
pred_df = pd.DataFrame(predict, columns=['Predicted_CO', 'Predicted_NO2', 'Predicted_NMHC'])
pred_df.head()
Out[100]:
Predicted_CO Predicted_NO2 Predicted_NMHC
0 3.173481 111.648145 288.824465
1 2.787117 103.856873 231.355085
2 2.804636 113.434809 155.664512
3 2.744741 112.419620 164.052277
4 2.139227 97.800635 94.988708
In [101]:
plt.figure(figsize=(18, 5))

for i, col in enumerate(y.columns):
    plt.subplot(1, 3, i+1)
    plt.scatter(y[col], pred_df.iloc[:, i], alpha=0.5, color="blue")
    plt.plot([y[col].min(), y[col].max()], 
             [y[col].min(), y[col].max()], "r--", linewidth=2)
    plt.xlabel(f"Actual {col}")
    plt.ylabel(f"Predicted {col}")
    plt.title(f"Y vs Predicted - {col}")


plt.show()
No description has been provided for this image
In [102]:
results = {"Pollutant": [], "Model": [], "RMSE": [], "R2": []}
In [103]:
y_pred_lin = lr.predict(x_test)   # predict only on test data
# Linear Regression metrics
mse_lin = mean_squared_error(y_test.iloc[:, i], y_pred_lin[:, i])
rmse_lin = np.sqrt(mse_lin)
r2_lin = r2_score(y_test.iloc[:, i], y_pred_lin[:, i])
results["Pollutant"].append(col)
results["Model"].append("Linear Regression")
results["RMSE"].append(rmse_lin)
results["R2"].append(r2_lin)
In [104]:
for i,col in enumerate(y.columns):
    plt.figure(figsize= (7,5))
    plt.scatter(y_test.iloc[:,i],y_pred_lin[:,i],alpha=0.5,color = "green")
    plt.plot([y_test.iloc[:, i].min(), y_test.iloc[:, i].max()],
             [y_test.iloc[:, i].min(), y_test.iloc[:, i].max()],
             "r--", linewidth=2)
    plt.xlabel(f"Actual {col}")
    plt.ylabel(f"Predicted {col}")
    plt.title(f"Y_test vs Predicted {col}")
    plt.show()
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [105]:
df = pd.concat([df.reset_index(drop=True), pred_df], axis=1)
In [106]:
df.head()
Out[106]:
Date Time CO(GT) PT08.S1(CO) NMHC(GT) C6H6(GT) PT08.S2(NMHC) NOx(GT) PT08.S3(NOx) NO2(GT) PT08.S4(NO2) PT08.S5(O3) T RH AH Predicted_CO Predicted_NO2 Predicted_NMHC
0 10/03/2004 18.00.00 2.6 1360.0 150.0 11.9 1046.0 166.0 1056.0 113.0 1692.0 1268.0 13.6 48.9 0.7578 3.173481 111.648145 288.824465
1 10/03/2004 19.00.00 2.0 1292.0 112.0 9.4 955.0 103.0 1174.0 92.0 1559.0 972.0 13.3 47.7 0.7255 2.787117 103.856873 231.355085
2 10/03/2004 20.00.00 2.2 1402.0 88.0 9.0 939.0 131.0 1140.0 114.0 1555.0 1074.0 11.9 54.0 0.7502 2.804636 113.434809 155.664512
3 10/03/2004 21.00.00 2.2 1376.0 80.0 9.2 948.0 172.0 1092.0 122.0 1584.0 1203.0 11.0 60.0 0.7867 2.744741 112.419620 164.052277
4 10/03/2004 22.00.00 1.6 1272.0 51.0 6.5 836.0 131.0 1205.0 116.0 1490.0 1110.0 11.2 59.6 0.7888 2.139227 97.800635 94.988708
In [107]:
knn_model = KNeighborsRegressor()
knn_model.fit(x_train,y_train)
print("Score : ",knn_model.score(x_test,y_test))
y_pred_knn = knn_model.predict(x_test)
Score :  0.8832258223309667
In [108]:
for i, col in enumerate(y.columns):
    mse_knn = mean_squared_error(y_test.iloc[:, i], y_pred_knn[:, i])
    rmse_knn = np.sqrt(mse_knn)
    r2_knn = r2_score(y_test.iloc[:, i], y_pred_knn[:, i])
    results["Pollutant"].append(col)
    results["Model"].append("KNN Regression (k=5)")
    results["RMSE"].append(rmse_knn)
    results["R2"].append(r2_knn)
results_df = pd.DataFrame(results)
print(results_df)
  Pollutant                 Model       RMSE        R2
0  NMHC(GT)     Linear Regression  88.451992  0.816715
1    CO(GT)  KNN Regression (k=5)   0.324897  0.944814
2   NO2(GT)  KNN Regression (k=5)  13.101629  0.833054
3  NMHC(GT)  KNN Regression (k=5)  73.972902  0.871809
In [ ]:
plt.figure(figsize=(8,5))
for pollutant in y.columns:
    subset = results_df[results_df["Pollutant"] == pollutant]
    plt.bar(subset["Model"], subset["RMSE"], label=pollutant)
plt.title("RMSE Comparison: Linear vs KNN Regression")
plt.ylabel("RMSE")
plt.show()
No description has been provided for this image
In [110]:
# Scatter plot: Actual vs Predicted for CO (example)
plt.figure(figsize=(6,6))
plt.scatter(y_test["CO(GT)"], y_pred_lin[:,0], alpha=0.6, label="Linear Regression")
plt.scatter(y_test["CO(GT)"], y_pred_knn[:,0], alpha=0.6, label="KNN Regression")
plt.plot([y_test["CO(GT)"].min(), y_test["CO(GT)"].max()],
         [y_test["CO(GT)"].min(), y_test["CO(GT)"].max()],
         "r--")  # Perfect prediction line
plt.title("CO(GT): Actual vs Predicted")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.legend()
plt.show()
No description has been provided for this image
In [111]:
results = []

for i, col in enumerate(y.columns):
    # Linear Regression
    mse_lin = mean_squared_error(y_test.iloc[:, i], y_pred_lin[:, i])
    rmse_lin = np.sqrt(mse_lin)
    r2_lin = r2_score(y_test.iloc[:, i], y_pred_lin[:, i])
    
    # KNN Regression
    mse_knn = mean_squared_error(y_test.iloc[:, i], y_pred_knn[:, i])
    rmse_knn = np.sqrt(mse_knn)
    r2_knn = r2_score(y_test.iloc[:, i], y_pred_knn[:, i])
    
    results.append({
        "Pollutant": col,
        "Linear RMSE": rmse_lin,
        "Linear R²": r2_lin,
        "KNN RMSE": rmse_knn,
        "KNN R²": r2_knn
    })

    results_df = pd.DataFrame(results)
print("\n🔹 Model Comparison Results:")
print(results_df)
🔹 Model Comparison Results:
  Pollutant  Linear RMSE  Linear R²   KNN RMSE    KNN R²
0    CO(GT)     0.291171   0.955677   0.324897  0.944814
1   NO2(GT)    11.885894   0.862599  13.101629  0.833054
2  NMHC(GT)    88.451992   0.816715  73.972902  0.871809
In [112]:
results_df.plot(
    x="Pollutant",
    y=["Linear RMSE", "KNN RMSE"],
    kind="bar",
    figsize=(10, 6)
)
plt.title("RMSE Comparison: Linear vs KNN Regression")
plt.ylabel("RMSE (Lower = Better)")
plt.xticks(rotation=45)
plt.show()
No description has been provided for this image
In [113]:
pollutant = y.columns[0]

plt.figure(figsize=(12,5))
plt.subplot(1, 2, 1)
plt.scatter(y_test.iloc[:,0], y_pred_lin[:,0], alpha=0.6, color='blue')
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title(f"Linear Regression: {pollutant}")

plt.subplot(1, 2, 2)
plt.scatter(y_test.iloc[:,0], y_pred_knn[:,0], alpha=0.6, color='green')
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title(f"KNN Regression: {pollutant}")

plt.tight_layout()
plt.show()
No description has been provided for this image